library(dplyr)
library(forcats)
library(vtable)
library(ggplot2)
library(plotly) # Optional
library(broom)
library(lmtest)
library(mice)Project Part 1
Group 9
Loading libraries
Loading dataset
dataset <- read.csv("SMARTc.csv", sep = ";") # Without missing valuesRe-encode the categorical variables
dataset <- mutate(dataset,
EVENT = factor(EVENT),
EVENT = fct_recode(EVENT, "no" = "0", "yes" = "1"),
SEX = factor(SEX),
SEX = fct_recode(SEX, "male" = "1", "female" = "2"),
DIABETES = factor(DIABETES),
DIABETES = fct_recode(DIABETES, "no" = "0", "yes" = "1"),
SMOKING = factor(SMOKING),
SMOKING = fct_recode(SMOKING, "never" = "1", "former" = "2", "current" = "3"),
alcohol = factor(alcohol),
alcohol = fct_recode(alcohol, "never" = "1", "former" = "2", "current" = "3"),
CEREBRAL = factor(CEREBRAL),
CEREBRAL = fct_recode(CEREBRAL, "no" = "0", "yes" = "1"),
CARDIAC = factor(CARDIAC),
CARDIAC = fct_recode(CARDIAC, "no" = "0", "yes" = "1"),
AAA = factor(AAA),
AAA = fct_recode(AAA, "no" = "0", "yes" = "1"),
PERIPH = factor(PERIPH),
PERIPH = fct_recode(PERIPH, "no" = "0", "yes" = "1"),
albumin = factor(albumin),
albumin = fct_recode(albumin, "no" = "1", "micro" = "2", "macro" = "3"),
STENOSIS = factor(STENOSIS),
STENOSIS = fct_recode(STENOSIS, "no" = "0", "yes" = "1"),
)Description of the dataset and table of variables
The dataset is about cardiovascular health. It contains two outcomes : EVENT and TEVENT, the presence of cardiovascular events and the number of days the patient is in study until the event occurs. The dataset contains many variables, some of them are categorical and some of them are numerical. It covers patient descriptives, classical risk factors, previous symptomatic atherosclerosis, and markers of atherosclerosis.
sumtable(dataset, out = "return", add.median = TRUE)Association between variables and the outcome
avg_event_proportion <- mean(as.numeric(dataset$EVENT == "yes"))
bar_plot <- ggplot(dataset, aes(x = SMOKING, fill = EVENT)) +
geom_bar(position = "fill") +
geom_hline(yintercept = avg_event_proportion, linetype = "dashed") +
labs(
title = "Cardiovascular Event by Smoking Status",
x = "Smoking Status", y = "Proportion (-- : Average)",
fill = "Cardiovascular Event"
) +
coord_flip()
ggplotly(bar_plot)This bar plot shows the proportion of cardiovascular events by smoking status. The dashed line represents the average proportion of cardiovascular events in the dataset. The proportion of cardiovascular events is higher for former smokers, even higher than the average.
boxplot <- ggplot(dataset, aes(x = as.factor(EVENT), y = AGE)) +
geom_boxplot(fill = "lightblue") +
labs(x = "EVENT", y = "AGE", title = "Boxplot of age by event")
ggplotly(boxplot)Logistic regression model
fit <- glm(EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
alcohol + albumin, data = dataset, family = "binomial")
tidy(fit)The logistic regression model shows that the variables associated with the lowest p-values are HISTCAR2, HDL and AGE. The other variables have higher p-values, which means they are less associated with the outcome.
anova(fit)conf_intervals <- confint(fit, trace = FALSE, test = c("LRT", "Rao"))Waiting for profiling to be done...
conf_intervals_df <- as.data.frame(conf_intervals)
conf_intervals_dfconf_intervals_df$Variable <- rownames(conf_intervals_df)
colnames(conf_intervals_df) <- c("Lower", "Upper", "Variable")
conf_intervals_df <- conf_intervals_df[conf_intervals_df$Variable != "(Intercept)", ]
conf_intervals_plot <- ggplot(conf_intervals_df, aes(x = Variable, ymin = Lower, ymax = Upper)) +
geom_errorbar(width = 0.2) +
geom_point(aes(y = (Lower + Upper) / 2)) +
coord_flip() +
labs(
title = "Confidence Intervals for Logistic Regression Coefficients",
x = "Variable",
y = "Confidence Interval"
) +
theme_minimal()
conf_intervals_plotImputation of missing values
dataset <- read.csv("SMART.csv", sep = ";") # With missing valuesdataset_imputed <- mice(dataset, m = 5, seed = 123)
dataset_imputed <- complete(dataset_imputed)fit_imputed <- with(dataset_imputed, glm(EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
alcohol + albumin, data = dataset, family = "binomial"))
tidy(fit_imputed)Comparison between the logistic regression model with and without imputation of missing values :
anova(fit_imputed)conf_intervals_imputed <- confint(fit_imputed, trace = FALSE, test = c("LRT", "Rao"))Waiting for profiling to be done...
conf_intervals_imputed_df <- as.data.frame(conf_intervals_imputed)
conf_intervals_imputed_dfconf_intervals_imputed_df$Variable <- rownames(conf_intervals_imputed_df)
colnames(conf_intervals_imputed_df) <- c("Lower", "Upper", "Variable")
conf_intervals_imputed_df <- conf_intervals_imputed_df[conf_intervals_imputed_df$Variable != "(Intercept)", ]
conf_intervals_imputed_plot <- ggplot(conf_intervals_imputed_df, aes(x = Variable, ymin = Lower, ymax = Upper)) +
geom_errorbar(width = 0.2) +
geom_point(aes(y = (Lower + Upper) / 2)) +
coord_flip() +
labs(
title = "Confidence Intervals for Logistic Regression Coefficients (Imputed)",
x = "Variable",
y = "Confidence Interval"
) +
theme_minimal()
conf_intervals_imputed_plotPlot of comparison between the two models
conf_intervals_df$Model <- "Without Imputation"
conf_intervals_imputed_df$Model <- "With Imputation"
conf_intervals_combined <- rbind(conf_intervals_df, conf_intervals_imputed_df)
conf_intervals_combined_plot <- ggplot(conf_intervals_combined, aes(x = Variable, ymin = Lower, ymax = Upper, color = Model)) +
geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.5,alpha = 0.5) +
geom_point(aes(y = (Lower + Upper) / 2), alpha = 0.5) +
coord_flip() +
labs(
title = "Confidence Intervals for Logistic Regression Coefficients",
x = "Variable",
y = "Confidence Interval"
) +
theme_minimal()
conf_intervals_combined_plotFinal part
# Load the dataset
Patients_C <- read.csv("SMART.csv", sep = ";")
# Categorical variable conversions
Patients_C <- Patients_C %>%
mutate(
SEX = factor(SEX) %>% fct_recode("male" = "1", "female" = "2"),
DIABETES = factor(DIABETES) %>% fct_recode("No" = "0", "Yes" = "1"),
SMOKING = factor(SMOKING) %>% fct_recode("never" = "1", "former" = "2", "current" = "3"),
alcohol = factor(alcohol) %>% fct_recode("never" = "1", "former" = "2", "current" = "3"),
CEREBRAL = factor(CEREBRAL) %>% fct_recode("No" = "0", "Yes" = "1"),
CARDIAC = factor(CARDIAC) %>% fct_recode("No" = "0", "Yes" = "1"),
AAA = factor(AAA) %>% fct_recode("No" = "0", "Yes" = "1"),
PERIPH = factor(PERIPH) %>% fct_recode("No" = "0", "Yes" = "1"),
albumin = factor(albumin) %>% fct_recode("no" = "1", "micro" = "2", "macro" = "3"),
STENOSIS = factor(STENOSIS) %>% fct_recode("No" = "0", "Yes" = "1")
)
# Final pre-processing
final_part_c <- Patients_C %>%
mutate(log_CREAT = log(CREAT)) %>%
select(-c(HISTCARD, CREAT)) # Exclude these columns# Impute missing values with 'mice' - generate 5 imputed datasets
imputed_data <- mice(final_part_c, m = 5, maxit = 5, method = 'pmm', seed = 123)
iter imp variable
1 1 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
1 2 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
1 3 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
1 4 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
1 5 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
2 1 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
2 2 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
2 3 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
2 4 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
2 5 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
3 1 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
3 2 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
3 3 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
3 4 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
3 5 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
4 1 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
4 2 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
4 3 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
4 4 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
4 5 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
5 1 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
5 2 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
5 3 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
5 4 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
5 5 DIABETES STENOSIS SYSTBP DIASTBP SYSTH DIASTH LENGTH WEIGHT BMI CHOL HDL LDL TRIG HOMOC GLUT IMT albumin SMOKING packyrs alcohol log_CREAT
Warning: Number of logged events: 525
# Fit logistic regression model on imputed datasets
log_reg_model <- with(imputed_data, glm(EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES + HISTCAR2 + HOMOC + log_CREAT + STENOSIS + IMT + SMOKING + alcohol + albumin,
family = binomial))
# Pool the results from the 5 imputed datasets
pooled_results <- pool(log_reg_model)
# View the pooled summary
summary(pooled_results)